# Chunk defaults for the whole document: keep code and all diagnostics visible.
knitr::opts_chunk$set(
  echo = TRUE,    # print the R source of every chunk
  error = TRUE,   # keep knitting when a chunk errors (easier to debug)
  message = TRUE, # include messages in the rendered codebook
  warning = TRUE  # include warnings in the rendered codebook
)

# Use the black-and-white ggplot2 theme for every plot in the session.
ggplot2::theme_set(ggplot2::theme_bw())

# Set pander's table split width to Inf so wide tables are not split.
pander::panderOptions("table.split.table", Inf)
# load libraries
library(codebook)
library(here)
library(dplyr)
library(tidyverse)
library(future)
library(labelled)
This is a data dictionary for the data set used in the paper “Are translation equivalents special? Evidence from simulations and empirical data from bilingual infants”.
# Import the data set (path built with here::here) and print the item-level
# overview produced by codebook_items().
keepers_ws_TE <- here::here("data_keepers/keepers_ws_TE_final.csv") %>%
  rio::import()
codebook_items(keepers_ws_TE)
The dataset has N = 200 participants and 229 data points.
# Number of distinct baby_id values and distinct ID_testdate values.
summarise(
  keepers_ws_TE,
  N_baby_id = n_distinct(baby_id),
  N_ID_testdate = n_distinct(ID_testdate)
)
## N_baby_id N_ID_testdate
## 1 200 229
While the majority of participants made only 1 visit, some participants made multiple visits.
# Count children per multiple_visits value, keeping one row per baby_id
# within each group.
# Note: the argument is `.keep_all`; a misspelling such as `.keep.all` is not
# an error, because distinct() evaluates it as an extra column expression.
keepers_ws_TE %>%
  group_by(multiple_visits) %>%
  distinct(baby_id, .keep_all = TRUE) %>%
  count()
## # A tibble: 2 x 2
## # Groups: multiple_visits [2]
## multiple_visits n
## <lgl> <int>
## 1 FALSE 156
## 2 TRUE 44
# Number of distinct ID_testdate records for each visit_num.
# count() returns only the grouping column and `n`, so no helper column is
# needed before it.
keepers_ws_TE %>%
  group_by(visit_num) %>%
  distinct(ID_testdate) %>%
  count()
## # A tibble: 3 x 2
## # Groups: visit_num [3]
## visit_num n
## <int> <int>
## 1 1 189
## 2 2 38
## 3 3 2
# Gender counts and percentages, using one row per baby_id.
keepers_ws_TE %>%
  distinct(baby_id, .keep_all = TRUE) %>%
  count(gender) %>%
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
## gender n percentage
## 1 F 94 47
## 2 M 106 53
# Mean, SD, minimum and maximum of age_continuous over all rows (NAs dropped).
keepers_ws_TE %>%
  summarize(
    mean_age_continuous = mean(age_continuous, na.rm = TRUE),
    sd_age_continuous = sd(age_continuous, na.rm = TRUE),
    min_age_continuous = min(age_continuous, na.rm = TRUE),
    max_age_continuous = max(age_continuous, na.rm = TRUE)
  )
## mean_age_continuous sd_age_continuous min_age_continuous max_age_continuous
## 1 24.40808 4.70224 18.38 33.5
# Histogram of age_continuous with ggplot2's default binning.
ggplot(keepers_ws_TE, aes(x = age_continuous)) +
  geom_histogram()
# Mean, SD, minimum and maximum of years_education over all rows (NAs dropped).
keepers_ws_TE %>%
  summarize(
    mean_years_education = mean(years_education, na.rm = TRUE),
    sd_years_education = sd(years_education, na.rm = TRUE),
    min_years_education = min(years_education, na.rm = TRUE),
    max_years_education = max(years_education, na.rm = TRUE)
  )
## mean_years_education sd_years_education min_years_education
## 1 16.59633 2.136448 10
## max_years_education
## 1 21
# Histogram of years_education with ggplot2's default binning.
ggplot(keepers_ws_TE, aes(x = years_education)) +
  geom_histogram()
All the data points included in the analysis are from bilinguals.
# Row counts and percentages for each lang_group level.
keepers_ws_TE %>%
  mutate(lang_group = as.factor(lang_group)) %>%
  count(lang_group) %>%
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
## lang_group n percentage
## 1 bilingual 229 100
# Stack the three exposure columns into long format, then report mean, SD,
# minimum and maximum per column (NAs dropped).
keepers_ws_TE %>%
  pivot_longer(
    cols = c(lang_exp_eng, lang_exp_fre, lang_exp_other),
    names_to = "language",
    values_to = "lang_exp"
  ) %>%
  group_by(language) %>%
  summarize(
    mean_lang_exp = mean(lang_exp, na.rm = TRUE),
    sd_lang_exp = sd(lang_exp, na.rm = TRUE),
    min_lang_exp = min(lang_exp, na.rm = TRUE),
    max_lang_exp = max(lang_exp, na.rm = TRUE)
  )
## # A tibble: 3 x 5
## language mean_lang_exp sd_lang_exp min_lang_exp max_lang_exp
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 lang_exp_eng 51.7 14.8 25 75
## 2 lang_exp_fre 47.8 15.0 25 75
## 3 lang_exp_other 0.565 1.78 0 10
# Bar chart of the mean exposure value per exposure column, with the column
# names relabelled for display.
keepers_ws_TE %>%
  pivot_longer(
    cols = c(lang_exp_eng, lang_exp_fre, lang_exp_other),
    names_to = "language",
    values_to = "lang_exp"
  ) %>%
  group_by(language) %>%
  summarize(mean_lang_exp = mean(lang_exp, na.rm = TRUE)) %>%
  mutate(
    language = recode(
      language,
      lang_exp_eng = "English",
      lang_exp_fre = "French",
      lang_exp_other = "Other language"
    )
  ) %>%
  ggplot(aes(x = language, y = mean_lang_exp)) +
  # geom_col() draws bar heights straight from y, like stat = "identity"
  geom_col() +
  labs(
    x = "Language",
    y = "Mean language exposure (%)"
  )
# Row counts per language for each of the two dominance columns.
keepers_ws_TE %>%
  pivot_longer(
    cols = c(lang_dom, lang_dom_vocab),
    names_to = "language_dominance",
    values_to = "language"
  ) %>%
  group_by(language_dominance, language) %>%
  summarize(n = n())
## # A tibble: 4 x 3
## # Groups: language_dominance [2]
## language_dominance language n
## <chr> <chr> <int>
## 1 lang_dom English 127
## 2 lang_dom French 102
## 3 lang_dom_vocab English 137
## 4 lang_dom_vocab French 92
# Dodged bar chart of row counts per language for each dominance column.
keepers_ws_TE %>%
  pivot_longer(
    cols = c(lang_dom, lang_dom_vocab),
    names_to = "language_dominance",
    values_to = "language"
  ) %>%
  group_by(language_dominance, language) %>%
  summarize(n = n()) %>%
  mutate(
    language_dominance = recode(
      language_dominance,
      lang_dom = "Language dominance \n based on % language exposure",
      lang_dom_vocab = "Language dominance \n based on vocabulary size"
    )
  ) %>%
  ggplot(aes(x = language_dominance, y = n, fill = language)) +
  # geom_col() draws bar heights straight from y, like stat = "identity"
  geom_col(position = position_dodge()) +
  labs(
    x = "Types of language dominance",
    y = "Number of participants"
  )
For most children, the language in which they produced the most words was also the language that they heard most often, although this was not the case for some children.
## Tally rows where lang_dom and lang_dom_vocab name the same language
## (consistent) versus different languages (inconsistent).
keepers_ws_TE %>%
  # 1 = consistent, 0 = inconsistent
  mutate(consistent_lang_dom = if_else(lang_dom == lang_dom_vocab, 1, 0)) %>%
  summarise(
    n = n(),
    n_consistent = sum(consistent_lang_dom),
    percentage_consistent = n_consistent / n * 100,
    n_inconsistent = n - n_consistent,
    percentage_inconsistent = n_inconsistent / n * 100
  )
## n n_consistent percentage_consistent n_inconsistent percentage_inconsistent
## 1 229 181 79.0393 48 20.9607
Thus, these two constructs were related, although not identical.
# Scatter plot of lang_nondom_input against balance_vocab with a linear fit
# (no confidence band) drawn underneath the points.
ggplot(keepers_ws_TE, aes(x = balance_vocab, y = lang_nondom_input)) +
  stat_smooth(method = "lm", se = FALSE, color = "black") +
  geom_point(shape = 1) +
  theme_light() +
  labs(
    x = "Balance based on vocabulary (BALANCE)",
    y = "Balance based on exposure"
  )
# Who filled in the CDI: counts and percentages per respondent category.
keepers_ws_TE %>%
  # fold "Grandmother" into the "Other family member" category
  mutate(
    cdi_filled_by = replace(
      cdi_filled_by,
      cdi_filled_by %in% "Grandmother",
      "Other family member"
    )
  ) %>%
  # tally respondents
  count(cdi_filled_by) %>%
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
## cdi_filled_by n percentage
## 1 Father 15 6.55
## 2 Mother 146 63.76
## 3 Mother and Father 10 4.37
## 4 Other family member 2 0.87
## 5 <NA> 56 24.45
# Counts and within-variable percentages of each response for the CDI
# completion/availability columns.
keepers_ws_TE %>%
  pivot_longer(
    cols = c(
      both_cdi_filled, required_cdi_filled, eng_cdi_filled,
      fre_cdi_filled, dom_cdi_filled, cdi_available
    ),
    names_to = "variable",
    values_to = "response"
  ) %>%
  group_by(variable, response) %>%
  summarize(n = n()) %>%
  # still grouped by `variable` here, so sum(n) is the per-variable total
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
## # A tibble: 6 x 4
## # Groups: variable [6]
## variable response n percentage
## <chr> <chr> <int> <dbl>
## 1 both_cdi_filled Y 229 100
## 2 cdi_available bothFilled 229 100
## 3 dom_cdi_filled Y 229 100
## 4 eng_cdi_filled Y 229 100
## 5 fre_cdi_filled Y 229 100
## 6 required_cdi_filled Y 229 100
# Mean, SD, minimum and maximum of each vocabulary column (NAs dropped).
keepers_ws_TE %>%
  pivot_longer(
    cols = c(
      total_words_eng, total_words_fre, word_vocab, concept_vocab,
      number_of_te, eng_unique_words, fre_unique_words
    ),
    names_to = "vocab_type",
    values_to = "vocab_score"
  ) %>%
  group_by(vocab_type) %>%
  summarize(
    mean = mean(vocab_score, na.rm = TRUE),
    sd = sd(vocab_score, na.rm = TRUE),
    min = min(vocab_score, na.rm = TRUE),
    max = max(vocab_score, na.rm = TRUE)
  )
## # A tibble: 7 x 5
## vocab_type mean sd min max
## <chr> <dbl> <dbl> <int> <int>
## 1 concept_vocab 227. 181. 4 695
## 2 eng_unique_words 98.5 125. 1 523
## 3 fre_unique_words 61.2 80.6 0 399
## 4 number_of_te 67.7 85.1 1 409
## 5 total_words_eng 166. 177. 3 657
## 6 total_words_fre 129. 124. 2 532
## 7 word_vocab 295. 255. 6 1071
Instead of coding the total number of words produced in English/French, these variables code the total number of words produced in the dominant/non-dominant language, as defined by vocabulary size (i.e., lang_dom_vocab: the language with the greater vocabulary size is the dominant language).
# Mean, SD, minimum and maximum of each dominant/non-dominant vocabulary
# column (NAs dropped).
keepers_ws_TE %>%
  pivot_longer(
    cols = c(
      total_words_dom, total_words_nondom, total_singlet_dom,
      total_singlet_nondom, singlet_vocab
    ),
    names_to = "vocab_type",
    values_to = "vocab_score"
  ) %>%
  group_by(vocab_type) %>%
  summarize(
    mean = mean(vocab_score, na.rm = TRUE),
    sd = sd(vocab_score, na.rm = TRUE),
    min = min(vocab_score, na.rm = TRUE),
    max = max(vocab_score, na.rm = TRUE)
  )
## # A tibble: 5 x 5
## vocab_type mean sd min max
## <chr> <dbl> <dbl> <int> <int>
## 1 singlet_vocab 160. 124. 2 525
## 2 total_singlet_dom 138. 124. 2 523
## 3 total_singlet_nondom 21.2 20.1 0 94
## 4 total_words_dom 206. 176. 4 657
## 5 total_words_nondom 88.9 98.5 2 469
# Mean, SD, minimum and maximum of balance_vocab over all rows (NAs dropped).
keepers_ws_TE %>%
  summarize(
    mean_balance_vocab = mean(balance_vocab, na.rm = TRUE),
    sd_balance_vocab = sd(balance_vocab, na.rm = TRUE),
    min_balance_vocab = min(balance_vocab, na.rm = TRUE),
    max_balance_vocab = max(balance_vocab, na.rm = TRUE)
  )
## mean_balance_vocab sd_balance_vocab min_balance_vocab max_balance_vocab
## 1 0.3083577 0.1250181 0.02380952 0.496063
# Histogram of balance_vocab with ggplot2's default binning.
ggplot(keepers_ws_TE, aes(x = balance_vocab)) +
  geom_histogram()
Balance can also be considered in terms of input in each language. To make balance_vocab and balance_input comparable, the language designated as DOM and NONDOM was based on vocabulary-defined dominance (lang_dom_vocab), rather than the language that children heard most and least often.
# Mean, SD, minimum and maximum of balance_input over all rows (NAs dropped).
# Every statistic must read balance_input: summarising balance_vocab here
# would merely repeat the vocabulary-balance table under input-balance names.
# NOTE: any recorded output for this chunk that is identical to the
# balance_vocab summary is stale and should be regenerated by re-knitting.
keepers_ws_TE %>%
  summarize(
    mean_balance_input = mean(balance_input, na.rm = TRUE),
    sd_balance_input = sd(balance_input, na.rm = TRUE),
    min_balance_input = min(balance_input, na.rm = TRUE),
    max_balance_input = max(balance_input, na.rm = TRUE)
  )
## mean_balance_input sd_balance_input min_balance_input max_balance_input
## 1 0.3083577 0.1250181 0.02380952 0.496063
# Histogram of balance_input with ggplot2's default binning.
ggplot(keepers_ws_TE, aes(x = balance_input)) +
  geom_histogram()
# Dodged bar chart of the English and French 90th-percentile columns, using
# one row per age_months_percentile value.
keepers_ws_TE %>%
  distinct(age_months_percentile, .keep_all = TRUE) %>%
  pivot_longer(
    cols = c(EngWS_90percentile, FrWS_90percentile),
    names_to = "language",
    values_to = "words_90percentile"
  ) %>%
  mutate(
    language = recode(
      language,
      EngWS_90percentile = "English",
      FrWS_90percentile = "French"
    )
  ) %>%
  ggplot(
    aes(x = age_months_percentile, y = words_90percentile, fill = language)
  ) +
  # geom_col() draws bar heights straight from y, like stat = "identity"
  geom_col(position = position_dodge()) +
  labs(
    x = "Age in months adjusted for the CDI-WS age range (18-30m)",
    y = "Number of words produced at the 90th percentile \n (obtained from Wordbank)"
  )